# Continuation of Session 1.
# Read the JSON export and keep only its `ayahs` element (the verse table).
Arabic <- fromJSON("arabicAyahsSimple.json")$ayahs
# DATA CLEANUP
# Drop the identifier columns that are not used in the analysis below.
Arabic <- Arabic[setdiff(
  names(Arabic),
  c("id", "edition_id", "page_id", "hizbQuarter_id")
)]
# Show the structure of the cleaned data frame.
str(Arabic)
## 'data.frame': 6236 obs. of 8 variables:
## $ surat_id : int 1 1 1 1 1 1 1 2 2 2 ...
## $ juz_id : int 1 1 1 1 1 1 1 1 1 1 ...
## $ number : int 1 2 3 4 5 6 7 8 9 10 ...
## $ text : chr "بسم الله الرحمن الرحيم" "الحمد لله رب العالمين" "الرحمن الرحيم" "مالك يوم الدين" ...
## $ numberinsurat: int 1 2 3 4 5 6 7 1 2 3 ...
## $ manzil_id : int 1 1 1 1 1 1 1 1 1 1 ...
## $ ruku_id : int 1 1 1 1 1 1 1 2 2 2 ...
## $ sajda_id : int NA NA NA NA NA NA NA NA NA NA ...
# Look at the first ayah and its length in characters.
Arabic[["text"]][1]
## [1] "بسم الله الرحمن الرحيم"
nchar(Arabic[["text"]][1])
## [1] 23
# NOTE(review): the string as displayed appears to contain 22 characters
# (19 letters + 3 spaces) while nchar() reports 23 -- possibly a hidden
# character (e.g. a BOM) in the first ayah; verify against the raw JSON.
# Store the character count of every ayah in a new `chars` column.
Arabic[["chars"]] <- nchar(Arabic[["text"]])
str(Arabic)
## 'data.frame': 6236 obs. of 9 variables:
## $ surat_id : int 1 1 1 1 1 1 1 2 2 2 ...
## $ juz_id : int 1 1 1 1 1 1 1 1 1 1 ...
## $ number : int 1 2 3 4 5 6 7 8 9 10 ...
## $ text : chr "بسم الله الرحمن الرحيم" "الحمد لله رب العالمين" "الرحمن الرحيم" "مالك يوم الدين" ...
## $ numberinsurat: int 1 2 3 4 5 6 7 1 2 3 ...
## $ manzil_id : int 1 1 1 1 1 1 1 1 1 1 ...
## $ ruku_id : int 1 1 1 1 1 1 1 2 2 2 ...
## $ sajda_id : int NA NA NA NA NA NA NA NA NA NA ...
## $ chars : int 23 21 13 14 22 21 52 26 37 54 ...
# Summary statistics for every column of Arabic, including `chars`.
summary(Arabic)
## surat_id juz_id number text
## Min. : 1.00 Min. : 1.00 Min. : 1 Length:6236
## 1st Qu.: 11.00 1st Qu.:12.00 1st Qu.:1560 Class :character
## Median : 26.00 Median :19.00 Median :3118 Mode :character
## Mean : 33.52 Mean :18.48 Mean :3118
## 3rd Qu.: 51.00 3rd Qu.:26.00 3rd Qu.:4677
## Max. :114.00 Max. :30.00 Max. :6236
##
## numberinsurat manzil_id ruku_id sajda_id
## Min. : 1.00 Min. :1.000 Min. : 1.0 Min. : 1.0
## 1st Qu.: 16.00 1st Qu.:3.000 1st Qu.:190.0 1st Qu.: 4.5
## Median : 38.00 Median :5.000 Median :325.0 Median : 8.0
## Mean : 53.51 Mean :4.529 Mean :313.2 Mean : 8.0
## 3rd Qu.: 75.00 3rd Qu.:7.000 3rd Qu.:455.0 3rd Qu.:11.5
## Max. :286.00 Max. :7.000 Max. :556.0 Max. :15.0
## NA's :6221
## chars
## Min. : 3.00
## 1st Qu.: 29.00
## Median : 54.00
## Mean : 66.39
## 3rd Qu.: 87.00
## Max. :711.00
##
# Scatter plot of `chars` against `surat_id`, coloured by `manzil_id`.
# Points are drawn semi-transparent (alpha = 0.3).
ggplot(Arabic, aes(surat_id, chars, colour = manzil_id)) +
  geom_point(alpha = 0.3) +
  # Disabled layers, kept for reference:
  # geom_line(alpha = 0.3) +
  # facet_wrap(facets = vars(manzil_id)) +
  # labs(title = "All Ayahs", x = "Ayat No", y = "Number of Characters") +
  theme_bw()
#ggplotly(p1)
# Split every ayah on single spaces; `words` is a list holding one
# character vector of tokens per ayah.
words <- strsplit(x = Arabic[["text"]], split = " ")
# Number of tokens in each ayah, stored as a new `words` column.
Arabic[["words"]] <- lengths(words)
#summary(words)
# Scatter plot of the `words` column against `surat_id`, coloured by
# `manzil_id`, with point size mapped to `numberinsurat`.
ggplot(Arabic, aes(surat_id, words, colour = manzil_id)) +
  geom_point(aes(size = numberinsurat), alpha = 0.1) +
  # Disabled layers, kept for reference (note the disabled y label says
  # "Number of Characters" although y maps `words` here):
  # geom_line(alpha = 0.3) +
  # facet_wrap(facets = vars(manzil_id)) +
  # labs(title = "All Ayahs", x = "Ayat No", y = "Number of Characters") +
  theme_bw()
# Flatten the per-ayah token lists into one character vector.
AllWords <- unlist(x = words)
str(AllWords)
## chr [1:82823] "بسم" "الله" "الرحمن" "الرحيم" "الحمد" "لله" "رب" "العالمين" ...
# Count how often each distinct token occurs, then order the counts from
# most to least frequent.
concordance <- table(AllWords)
concordance <- sort(concordance, decreasing = TRUE)
# Plot the first 15 entries, i.e. the most frequent tokens.
plot(concordance[seq_len(15)])